import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

def clean_text(text):
    """Collapse whitespace in ``text`` while keeping punctuation and interjections.

    Values that ``pd.isna`` reports as missing yield an empty string; any
    other value is converted with ``str`` before cleaning.
    """
    if pd.isna(text):
        return ""
    stripped = str(text).strip()
    # Every run of whitespace becomes a single space
    return re.sub(r'\s+', ' ', stripped)

def clean_text_for_topic(text):
    """Prepare ``text`` for topic modelling by removing punctuation.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is replaced by a space. Whitespace is then
    collapsed and trimmed, so the result never contains runs of spaces or
    leading/trailing blanks.

    Values that ``pd.isna`` reports as missing yield an empty string; any
    other value is converted with ``str`` first.
    """
    if pd.isna(text):
        return ""
    # Replace punctuation / special characters with spaces first ...
    without_punct = re.sub(r'[^\w\s]', ' ', str(text))
    # ... and normalise whitespace afterwards, so the gaps left behind by
    # removed punctuation are collapsed as well.
    return re.sub(r'\s+', ' ', without_punct).strip()

def get_text_features(text):
    """Return ``(word_count, char_count)`` for ``text``.

    Words are the whitespace-separated tokens produced by ``str.split``; the
    character count is ``len(text)``. Missing or empty input gives ``(0, 0)``.
    """
    is_blank = pd.isna(text) or text == ""
    if is_blank:
        return 0, 0

    return len(text.split()), len(text)

def analyze_sentiment(text):
    """Score ``text`` with TextBlob and assign a sentiment label.

    Returns a 4-tuple ``(polarity, subjectivity, intensity, label)`` where
    ``intensity`` is ``abs(polarity)`` and ``label`` is ``"positive"`` for a
    polarity above 0.1, ``"negative"`` for a polarity below -0.1 and
    ``"neutral"`` otherwise. Missing or empty input short-circuits to
    ``(0, 0, 0, "neutral")`` without invoking TextBlob.
    """
    if pd.isna(text) or text == "":
        return 0, 0, 0, "neutral"

    scores = TextBlob(str(text)).sentiment
    polarity = scores.polarity
    subjectivity = scores.subjectivity

    # Polarity within [-0.1, 0.1] is treated as neutral
    if polarity > 0.1:
        label = "positive"
    elif polarity < -0.1:
        label = "negative"
    else:
        label = "neutral"

    # Intensity is the magnitude of the polarity, regardless of direction
    return polarity, subjectivity, abs(polarity), label

# Extended lexicon: moral emotion -> lower-case English cue words
moral_emotion_words = {
    'anger': [
        'angry', 'furious', 'outraged', 'annoyed', 'irritated', 'mad', 'anger',
        'rage', 'fury', 'irate', 'hostile', 'hate', 'hateful', 'indignant',
        'offended', 'resentful', 'outrage', 'insulted', 'affronted', 'violated',
        'wronged',
    ],
    'shame': [
        'ashamed', 'embarrassed', 'humiliated', 'shame', 'shameful', 'disgrace',
        'dishonor', 'mortified', 'inferior', 'inadequate', 'worthless',
        'defective', 'incompetent', 'unworthy',
    ],
    'disgust': [
        'disgusted', 'repulsed', 'revolted', 'appalled', 'disgust', 'disgusting',
        'repulsive', 'repugnant', 'abhorrent', 'detestable',
    ],
    'contempt': [
        'contempt', 'disdain', 'scorn', 'despise', 'contemptuous', 'disdainful',
        'scornful', 'despicable', 'beneath',
    ],
    'guilt': [
        'guilty', 'remorse', 'regret', 'apologetic', 'culpable', 'blame',
        'blamed', 'fault', 'responsible', 'wrong', 'sorry', 'apology', 'mistake',
        'error', 'transgression',
    ],
    'disappointment': [
        'disappointed', 'letdown', 'dissatisfied', 'disappointment',
        'dissatisfaction', 'failed', 'failure', 'unsatisfactory', 'unfulfilled',
        'unmet',
    ],
    'concern': [
        'concerned', 'worried', 'anxious', 'troubled', 'concern', 'worry',
        'anxiety', 'apprehension', 'uneasy', 'disturbed',
    ],
}

def analyze_moral_emotions(text):
    """Count lexicon hits per moral emotion in ``text``.

    The text is lower-cased and split into whole-word tokens; each token that
    appears in an emotion's word list adds one to that emotion's count.
    Returns a dict with one integer entry per key of ``moral_emotion_words``
    (all zeros for missing or empty input).
    """
    if pd.isna(text) or text == "":
        return dict.fromkeys(moral_emotion_words, 0)

    # Regex tokenisation keeps complete words and drops punctuation
    tokens = re.findall(r'\b\w+\b', str(text).lower())

    return {
        emotion: sum(1 for token in tokens if token in vocabulary)
        for emotion, vocabulary in moral_emotion_words.items()
    }

# Load the workbook containing the raw descriptions
print("正在读取数据...")
df = pd.read_excel('ECSIR text.xlsx')
raw_text = df['ECSiR-English Text']

# Two cleaned variants: whitespace-normalised, and punctuation-free for topic modelling
print("正在清理文本...")
df['cleaned_english_text'] = raw_text.apply(clean_text)
df['cleaned_english_text_for_topic'] = raw_text.apply(clean_text_for_topic)

# Word / character counts are taken from the whitespace-normalised text
print("正在提取文本特征...")
features = df['cleaned_english_text'].apply(get_text_features)
for position, column in enumerate(('word_count', 'char_count')):
    df[column] = [feature[position] for feature in features]

# Sentiment scoring runs on the raw (uncleaned) text
print("正在进行情感分析...")
sentiments = raw_text.apply(analyze_sentiment)
sentiment_columns = (
    'sentiment_polarity',
    'sentiment_subjectivity',
    'sentiment_intensity',
    'sentiment_category',
)
for position, column in enumerate(sentiment_columns):
    df[column] = [scores[position] for scores in sentiments]

# Moral-emotion lexicon counts also use the raw text; one column per emotion
print("正在分析道德情感...")
moral_emotions = raw_text.apply(analyze_moral_emotions)
for emotion in moral_emotion_words:
    df[f'emotion_{emotion}'] = [counts[emotion] for counts in moral_emotions]

# Print the sentiment analysis summary
print("\n情感分析统计结果：")
print("\n1. 情感极性分布：")
sentiment_dist = df['sentiment_category'].value_counts()
print(sentiment_dist)
polarity_scores = df['sentiment_polarity']
print("\n平均情感极性：", polarity_scores.mean())

# Selected descriptive statistics of the polarity scores
polarity_stats = polarity_scores.describe()
print("\n情感极性详细统计：")
for label, stat in (("最小值", 'min'), ("最大值", 'max'), ("标准差", 'std'), ("中位数", '50%')):
    print(f"{label}：{polarity_stats[stat]:.3f}")

print("\n2. 情感强度统计：")
intensity_scores = df['sentiment_intensity']
intensity_stats = intensity_scores.describe()
print(intensity_stats)

# Short textual summary of the intensity distribution
print("\n情感强度分布特征：")
intensity_summary = (
    ("平均情感强度", 'mean'),
    ("中等强度（中位数）", '50%'),
    ("高强度（75%分位数）", '75%'),
    ("最高强度", 'max'),
)
for label, stat in intensity_summary:
    print(f"{label}：{intensity_stats[stat]:.3f}")

# Share of texts whose intensity exceeds 0.2, as a percentage
high_intensity_ratio = (intensity_scores > 0.2).mean() * 100
print(f"\n高强度文本（强度>0.2）占比：{high_intensity_ratio:.1f}%")

print("\n3. 主要道德情感分布：")
emotion_sums = {emotion: df[f'emotion_{emotion}'].sum() for emotion in moral_emotion_words}
total_emotions = sum(emotion_sums.values())
# Percentages are only defined when at least one lexicon word was found
if total_emotions > 0:
    emotion_percentages = {
        emotion: count/total_emotions*100 for emotion, count in emotion_sums.items()
    }
    ranked_emotions = sorted(emotion_percentages.items(), key=lambda item: item[1], reverse=True)
    for emotion, percentage in ranked_emotions:
        print(f"{emotion}: {percentage:.1f}%")

# Build the 2x2 sentiment overview figure
print("\n正在创建情感分析可视化...")
plt.figure(figsize=(15, 12))

# Panel 1: histogram of sentiment polarity
plt.subplot(2, 2, 1)
plt.hist(df['sentiment_polarity'], bins=30, color='skyblue', edgecolor='black')
plt.title('Distribution of Sentiment Polarity')
plt.xlabel('Polarity (Negative -> Positive)')
plt.ylabel('Frequency')

# Panel 2: histogram of sentiment intensity (absolute polarity)
plt.subplot(2, 2, 2)
plt.hist(df['sentiment_intensity'], bins=30, color='lightgreen', edgecolor='black')
plt.title('Distribution of Sentiment Intensity')
plt.xlabel('Intensity')
plt.ylabel('Frequency')

# Panel 3: share of positive / negative / neutral labels
plt.subplot(2, 2, 3)
sentiment_counts = df['sentiment_category'].value_counts()
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%')
plt.title('Distribution of Sentiment Categories')

# Panel 4: moral emotions as a percentage of all lexicon hits, largest first
plt.subplot(2, 2, 4)
if total_emotions > 0:
    emotion_percentages = {
        emotion: count / total_emotions * 100 for emotion, count in emotion_sums.items()
    }
else:
    # No lexicon word matched anywhere: plot explicit zeros instead of
    # dividing by zero (which would yield NaN bars or raise).
    emotion_percentages = {emotion: 0.0 for emotion in emotion_sums}
sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)
emotions, percentages = zip(*sorted_emotions)

plt.bar(emotions, percentages)
plt.xticks(rotation=45, ha='right')
plt.title('Distribution of Moral Emotions')
plt.ylabel('Percentage (%)')
plt.tight_layout()

plt.savefig('detailed_sentiment_analysis.png', dpi=300, bbox_inches='tight')
plt.close()

# Render an AMJ-style figure caption as a separate image.
# N is taken from the number of rows in df (the rows the histograms are drawn
# from) instead of being hard-coded, so the caption stays in sync with the data.
plt.figure(figsize=(15, 10))
caption = (
    f'Figure 2. Detailed sentiment analysis of ECSIR descriptions (N = {len(df)}).\n\n'
    'Panel A (top left): Distribution of sentiment polarity scores, showing the emotional valence of descriptions.\n'
    'Panel B (top right): Distribution of sentiment intensity, indicating the strength of emotional expression.\n'
    'Panel C (bottom left): Proportion of positive, negative, and neutral sentiments in descriptions.\n'
    'Panel D (bottom right): Distribution of specific moral emotions across all descriptions, revealing the predominant emotional themes in reporting irresponsible behaviors.'
)
plt.text(0.5, 0.5, caption, ha='center', va='center', wrap=True)
plt.axis('off')
plt.savefig('sentiment_analysis_caption.png', dpi=300, bbox_inches='tight')
plt.close()

# Prepare the LDA input from the punctuation-free text
print("\n正在准备LDA分析...")
# Document-term count matrix; vocabulary capped at n_features terms
n_features = 2000
vectorizer_params = dict(
    max_features=n_features,
    stop_words='english',
    max_df=0.90,  # ignore terms appearing in more than 90% of documents
    min_df=3,     # ignore terms appearing in fewer than 3 documents
)
tf_vectorizer = CountVectorizer(**vectorizer_params)
tf = tf_vectorizer.fit_transform(df['cleaned_english_text_for_topic'])

# Fit the LDA model
print("正在训练LDA模型...")
n_topics = 7
lda_params = dict(
    n_components=n_topics,
    max_iter=20,
    learning_method='online',
    learning_offset=50.,
    random_state=42,
    doc_topic_prior=0.1,    # small prior; intended to concentrate each document on few topics
    topic_word_prior=0.01,  # small prior; intended to concentrate each topic on few words
)
lda = LatentDirichletAllocation(**lda_params)

# NOTE: despite its name, topic_word holds the matrix returned by fit_transform,
# indexed below as [document, topic]; the name is kept because later code uses it.
topic_word = lda.fit_transform(tf)

# Vocabulary terms, aligned with the columns of lda.components_
feature_names = tf_vectorizer.get_feature_names_out()

# Print the highest-weighted words of every topic
print("\n发现的主题：")
n_top_words = 15
for topic_number, topic_weights in enumerate(lda.components_, start=1):
    # Indices of the n_top_words largest weights, largest first
    ranked = np.argsort(topic_weights)[::-1][:n_top_words]

    print(f"\n主题 {topic_number}:")
    for term, term_weight in zip(feature_names[ranked], topic_weights[ranked]):
        print(f"{term}: {term_weight:.3f}")

# Mean weight of each topic across all documents
avg_topic_distribution = topic_word.mean(axis=0)
print("\n各主题的平均权重：")
for topic_number, mean_weight in enumerate(avg_topic_distribution, start=1):
    print(f"主题 {topic_number}: {mean_weight:.3f}")

# Show, for every topic, the document with the highest weight on that topic
print("\n每个主题的典型文档示例：")
for topic_idx in range(n_topics):
    # Row position of the document with the largest weight for this topic
    top_doc_idx = topic_word[:, topic_idx].argsort()[-1]
    print(f"\n主题 {topic_idx + 1} 的典型文档:")
    raw_doc = df['ECSiR-English Text'].iloc[top_doc_idx]
    # The raw column may contain NaN or non-string cells; slicing those
    # directly would raise TypeError, so normalise to str first.
    doc_text = "" if pd.isna(raw_doc) else str(raw_doc)
    # Show at most 200 characters; add the ellipsis only when text was cut
    suffix = "..." if len(doc_text) > 200 else ""
    print(doc_text[:200] + suffix)

# Persist the enriched table
print("\n正在保存处理后的数据...")
output_path = 'ECSIR_text_processed_with_topics.xlsx'
# One column per topic holding each document's weight for that topic
for topic_number, doc_weights in enumerate(topic_word.T, start=1):
    df[f'Topic_{topic_number}_Weight'] = doc_weights

df.to_excel(output_path, index=False)
print(f"处理后的数据已保存到 '{output_path}'")

# Bar chart of the average weight of each topic
print("\n正在创建可视化...")
fig, ax = plt.subplots(figsize=(12, 6))
topic_numbers = range(1, n_topics + 1)
ax.bar(topic_numbers, avg_topic_distribution)
ax.set_title('Average Topic Distribution')
ax.set_xlabel('Topic Number')
ax.set_ylabel('Average Weight')
fig.savefig('topic_distribution.png')
plt.close(fig)

# Heatmap of topic-word weights for the most important words
plt.figure(figsize=(15, 10))
word_importance = pd.DataFrame(
    lda.components_,
    columns=feature_names,
    index=[f'Topic {i+1}' for i in range(n_topics)]
)
# Collect the 15 highest-weighted words of every topic
top_words_per_topic = []
for topic in range(n_topics):
    top_words = word_importance.iloc[topic].nlargest(15).index.tolist()
    top_words_per_topic.extend(top_words)
# De-duplicate while preserving first-seen order. A set would make the column
# order depend on string hash randomization, so the heatmap layout would
# differ between runs even with a fixed random_state.
top_words_per_topic = list(dict.fromkeys(top_words_per_topic))
word_importance_subset = word_importance[top_words_per_topic]

sns.heatmap(word_importance_subset, cmap='YlOrRd', annot=False)
plt.title('Topic-Word Importance Heatmap')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('topic_word_heatmap.png')
plt.close()

print("\n分析完成！")